Data Processing

# --- Setup ----
# NOTE(review): rm(list = ls()) wipes the user's entire workspace as a side
# effect; running the script in a fresh R session is the safer convention.
rm(list=ls())
library("reshape2")
library("ggplot2")
library("NMF")  # for heatmaps
library("ggmap")  # for map plot 
library("gridExtra")

#library(plyr)  # to manipulate tables

Importing Data

Let’s import the newest tHPI data.

# Read the raw tHPI table.
# NOTE(review): hard-coded absolute path — this only runs on the author's
# machine; consider a relative path or a parameter.
data.csv <- read.csv("/Users/bian/sandbox/thpi_us.csv")
data.tabular <- data.csv  # working copy; data.csv stays pristine for later reuse
head(data.tabular)
##           city January February March April May June July August September
## 1     New York     243      245   297   377 410  372  317    320       430
## 2  Los Angeles     197      196   203   219 219  227  252    245       220
## 3      Chicago     154      163   213   259 333  311  299    267       293
## 4       Dallas     148      143   154   158 150  148  145    147       157
## 5 Philadelphia     172      189   207   228 246  244  200    198       247
## 6      Houston     149      157   157   161 173  146  149    143       150
##   October      lat        lon
## 1     430 40.71278  -74.00594
## 2     219 34.05223 -118.24368
## 3     307 41.83690  -87.68470
## 4     163 32.77670  -96.79700
## 5     241 39.95000  -75.16670
## 6     154 29.76040  -95.36980

Let’s now reshape our input data to a more friendly format.

# Reshape to long format: one row per (city, month) pair with the price in a
# single column. Columns 1-11 are city plus the ten month columns.
molten <- melt(data.tabular[, 1:11], id = "city")
data <- molten[, c("city", "variable", "value")]
names(data) <- c("city", "month", "price")
head(data)
##           city   month price
## 1     New York January   243
## 2  Los Angeles January   197
## 3      Chicago January   154
## 4       Dallas January   148
## 5 Philadelphia January   172
## 6      Houston January   149

We remove the $-sign and turn prices into numeric values.

# Drop every non-digit character (e.g. a leading "$") and coerce to numeric.
data$price <- as.numeric(gsub("[^0-9]", "", data$price))
head(data)
##           city   month price
## 1     New York January   243
## 2  Los Angeles January   197
## 3      Chicago January   154
## 4       Dallas January   148
## 5 Philadelphia January   172
## 6      Houston January   149
summary(data)
##       city          month         price      
##  Atlanta: 10   January : 25   Min.   : 98.0  
##  Boston : 10   February: 25   1st Qu.:150.0  
##  Chicago: 10   March   : 25   Median :178.5  
##  Dallas : 10   April   : 25   Mean   :202.8  
##  Denver : 10   May     : 25   3rd Qu.:234.8  
##  Detroit: 10   June    : 25   Max.   :454.0  
##  (Other):190   (Other) :100

Let’s convert months to Date format

data$date <-as.Date(paste("01",data$month,"15",sep=""),"%d%B%Y") 

EDA

Let’s illustrate the trends

# Overall price density across all cities and months.
ggplot(data, aes(x=price)) + geom_density()

# Price density split by month.
ggplot(data, aes(x=price,color=month)) + geom_density()

# Price density split by city.
ggplot(data, aes(x=price,color=city)) + geom_density()

# Monthly box plots; x labels rotated 90 degrees so month names stay readable.
ggplot(data, aes(x=month,y=price)) + geom_boxplot() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

# Per-city box plots with the same rotated labels.
ggplot(data, aes(x=city,y=price)) + geom_boxplot() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

# Pooled time trend with the default loess smoother (for <1000 points).
ggplot(data, aes(x=date,y=price)) + geom_point() + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

# Per-city time trends; standard-error ribbons suppressed to reduce clutter.
ggplot(data, aes(x=date,y=price,color=city)) + geom_point() + geom_smooth(se=FALSE)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

heatmap

For plotting the heatmap, we have to use the original data.tabular, which has a matrix-like format.

# Rebuild the price table as a plain numeric matrix (cities x months) for the
# heatmap functions. Columns 2-11 are the ten month columns.
nba <- data.tabular[, 2:11]

# Strip non-digit characters (the "$" sign) column by column; sapply over a
# data frame of equal-length columns returns a matrix here.
nba <- sapply(nba, function(x) as.numeric(gsub("[^[:digit:]]", "", x)))
row.names(nba) <- data.tabular$city

nba_matrix <- data.matrix(nba)
# Close any open graphics device before drawing. Guarded because calling
# dev.off() when only the null device is active raises an error in a fresh
# session (the original unguarded call relied on a device already being open).
if (dev.cur() > 1) dev.off()
nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, col = heat.colors(256), scale="row", margins=c(6,10))

Let’s do some heatmaps. First the original one, then a normalized one based on cities and last a normalized one by months.

# Raw prices: no scaling, rows/columns kept in data order (no clustering).
aheatmap(nba_matrix, Rowv=FALSE, Colv=FALSE, fontsize=5, cexRow=2, cexCol=2)

# normalized based on cities
# NOTE(review): scale = "column" standardises each month column — confirm this
# matches the "based on cities" intent described above.
aheatmap(nba_matrix, color = "-RdBu:50", scale = "column", Rowv=FALSE, Colv=FALSE,fontsize=5, cexRow=2, cexCol=2)

# normalized based on months
# NOTE(review): scale = "row" standardises each city row — confirm intent.
aheatmap(nba_matrix, color = "-RdBu:50", scale = "row", Rowv=FALSE, Colv=FALSE,fontsize=3, cexRow=3, cexCol=2)

Using Latitudal Information

# Assemble the lat/lon points into a data frame.
# data.frame() is preferred over the original as.data.frame(cbind(...)):
# cbind() builds a matrix first and would silently coerce mixed column types
# to character.
lon <- data.tabular$lon
lat <- data.tabular$lat
df <- data.frame(lon = lon, lat = lat)

# Fetch a terrain base map centred on the mean of the city coordinates.
# NOTE(review): get_map() against the Google static-map service now requires
# an API key (ggmap::register_google()) — confirm credentials before running.
mapgilbert <- get_map(location = c(lon = mean(df$lon), lat = mean(df$lat)), zoom = 4,
                      maptype = "terrain", scale = 1)
# Monthly prices as numeric columns for the map fill aesthetic.
data.tabular$sept.price <- as.numeric(gsub("[^[:digit:]]",'',data.tabular$September))
data.tabular$oct.price <- as.numeric(gsub("[^[:digit:]]",'',data.tabular$October))
data.tabular$aug.price <- as.numeric(gsub("[^[:digit:]]",'',data.tabular$August))

# October prices on the map (filled circles).
# NOTE(review): "label" is not an aesthetic of geom_point() and is ignored
# with a warning — kept for fidelity; drop it if the warning is noisy.
ggmap(mapgilbert) + geom_point(data = data.tabular, aes(x = lon, y = lat, fill = oct.price,label=city), size = 5, shape = 21)

# Three months per city on one map: September at the city location (circle,
# shape 21), October shifted north (square, shape 22), August shifted east
# (triangle, shape 24); all share one diverging fill gradient centred at 250.
ggmap(mapgilbert) +
  geom_point(data = data.tabular, aes(x = lon, y = lat, fill = sept.price),size = 5, shape = 21) + 
  geom_point(data = data.tabular, aes(x = lon, y = lat+.7, fill = oct.price),size = 5, shape = 22) + 
  geom_point(data = data.tabular, aes(x = lon+.7, y = lat, fill = aug.price),size = 5, shape = 24) + 
#  scale_fill_gradient2(low = "#0000FF", high ="#FF0000", midpoint = mean(data.tabular$oct.price))
  scale_fill_gradient2(low = "#0000FF", high ="#FF0000", midpoint = 250)

# Same October view without the base map, with city names annotated.
ggplot(data = data.tabular, aes(x = lon, y = lat,label=city)) + 
  geom_point(aes(fill = oct.price),size = 5, shape = 21) + 
  geom_text(aes(label=city),hjust=0, vjust=1.5) + 
  scale_fill_gradient2(low = "#0000FF", high ="#FF0000", midpoint = 250)

Categorizing the cities

We can’t continue with a bunch of raw numbers. Let’s extract specific statistics which we can use for further studies. We categorize cities based on their price average and their price trend, and later we will add information about their climate, population, number of hotels, etc.

# getting the price statistics

# Per-city summary statistics: mean, standard deviation, and range (max - min)
# of the monthly prices.
data.summary <- data.frame(city = unique(data$city))

# aggregate() yields one row per city; match() aligns those rows with
# data.summary$city in a single vectorised lookup (the original scanned the
# aggregate table once per city with sapply(), an O(n^2) pattern).
d <- aggregate(data$price, by = list(city = data$city), FUN = mean)
data.summary$price.mean <- d$x[match(data.summary$city, d$city)]

d <- aggregate(data$price, by = list(city = data$city), FUN = sd)
data.summary$price.sd <- d$x[match(data.summary$city, d$city)]

# diff(range(p)) == max(p) - min(p), the same quantity the original computed
# from the two elements returned by FUN = range.
d <- aggregate(data$price, by = list(city = data$city), FUN = function(p) diff(range(p)))
data.summary$price.range <- d$x[match(data.summary$city, d$city)]
head(data.summary)
##           city price.mean  price.sd price.range
## 1     New York      344.1 70.345891         187
## 2  Los Angeles      219.7 18.541545          56
## 3      Chicago      259.9 62.939918         179
## 4       Dallas      151.3  6.429965          20
## 5 Philadelphia      217.2 27.336585          75
## 6      Houston      153.9  8.685237          30
# assigning price classes
# cut() with breaks = n splits the observed range into n equal-width
# intervals and returns a factor of interval labels.
data.summary$price.mean.class <- cut(data.summary$price.mean,breaks=3)
data.summary$price.sd.class <- cut(data.summary$price.sd,breaks=2)
data.summary$price.range.class <- cut(data.summary$price.range,breaks=2)
head(data.summary)
##           city price.mean  price.sd price.range price.mean.class
## 1     New York      344.1 70.345891         187        (284,359]
## 2  Los Angeles      219.7 18.541545          56        (210,284]
## 3      Chicago      259.9 62.939918         179        (210,284]
## 4       Dallas      151.3  6.429965          20        (135,210]
## 5 Philadelphia      217.2 27.336585          75        (210,284]
## 6      Houston      153.9  8.685237          30        (135,210]
##   price.sd.class price.range.class
## 1    (44.5,82.9]         (126,235]
## 2    (6.12,44.5]        (17.8,126]
## 3    (44.5,82.9]         (126,235]
## 4    (6.12,44.5]        (17.8,126]
## 5    (6.12,44.5]        (17.8,126]
## 6    (6.12,44.5]        (17.8,126]
# let's check the price distributions within each derived class
ggplot(data=data.summary, aes(x=price.mean.class,y=price.mean)) + geom_boxplot()

ggplot(data=data.summary, aes(x=price.sd.class,y=price.sd)) + geom_boxplot()

ggplot(data=data.summary, aes(x=price.range.class,y=price.range)) + geom_boxplot()

Now, we bring the GPS data to data.summary.

# Attach each city's coordinates via a vectorised match() lookup (first
# matching row of data.tabular), replacing the original per-city sapply()
# scans which were O(n^2).
data.summary$lat <- as.numeric(data.tabular$lat[match(data.summary$city, data.tabular$city)])
data.summary$lon <- as.numeric(data.tabular$lon[match(data.summary$city, data.tabular$city)])
# Map-like scatter: colour encodes the mean-price class, shape the sd class.
ggplot(data=data.summary, aes(x=lon,y=lat,color=price.mean.class)) + geom_point(aes(shape=price.sd.class),size=4) + geom_text(aes(label=city),hjust=0, vjust=0) 

From the results of the last plot we understand that the low-price cities don’t have much variance in their prices. Let’s check that here.

data.summary$price.mean
##  [1] 344.1 219.7 259.9 151.3 217.2 153.9 294.5 195.3 174.6 358.7 297.8
## [12] 165.3 144.4 185.9 150.5 138.9 135.0 138.3 194.7 214.9 153.3 249.1
## [23] 196.8 137.1 198.1
# Price classes via nested ifelse(): cheap <= 210 < moderate <= 284 < expensive.
# The thresholds mirror the cut() breakpoints observed above.
data.summary$class.price <- ifelse(data.summary$price.mean > 284, "expensive",
                                   ifelse(data.summary$price.mean > 210, "moderate",
                                          "cheap"))

# An sd-based stability rule (threshold 44.5) was considered earlier; the
# range-based rule (threshold 100) is the one in effect.
data.summary$class.stability <- ifelse(data.summary$price.range > 100, "unstable", "stable")

# Helper: price-trend plot for the given cities, on a fixed y-scale so the
# five class plots below are directly comparable. (The original repeated this
# plot code five times; the plots produced are identical.)
plot_city_trends <- function(cities, title) {
  ggplot(subset(data, city %in% cities), aes(x = date, y = price, color = city)) +
    geom_point() +
    geom_smooth(se = FALSE) +
    ggtitle(title) +
    scale_y_continuous(limits = c(90, 500))
}

plot_city_trends(data.summary$city[data.summary$class.price == "cheap"], "Cheap Cities")

plot_city_trends(data.summary$city[data.summary$class.price == "moderate"], "Moderate Cities")

plot_city_trends(data.summary$city[data.summary$class.price == "expensive"], "Expensive Cities")

plot_city_trends(data.summary$city[data.summary$class.stability == "stable"], "Stable Cities")

plot_city_trends(data.summary$city[data.summary$class.stability == "unstable"], "Unstable Cities")

Using the latter diagrams, we define the following new categories.

# Manual trend labels per city, read off the smoothed curves above.
# BUG FIX: the original tagged "Meanneapolis", which matches no city in the
# data (the dataset spells it "Minneapolis"), so Minneapolis was silently
# excluded from the "upupdownup" group.
data$trend[data$city %in% c("Phoenix","Miami")] <- "updownup"
data$trend[data$city %in% c("Boston","Minneapolis","Chicago","New York")] <- "upupdownup"
data$trend[data$city %in% c("San Francisco")] <- "upupup"
data$trend[data$city %in% c("Seattle")] <- "upupupdown"

# One panel per trend class; a shared y-scale keeps the panels comparable.
# (The original spelled out the same ggplot chain four times.)
plot_trend_panel <- function(the_trend) {
  ggplot(subset(data, trend == the_trend), aes(x = date, y = price, color = city)) +
    geom_point() +
    geom_smooth(se = FALSE, aes(color = city)) +
    ggtitle("three different trends") +
    scale_y_continuous(limits = c(90, 500))
}

p1 <- plot_trend_panel("updownup")
p2 <- plot_trend_panel("upupdownup")
p3 <- plot_trend_panel("upupup")
p4 <- plot_trend_panel("upupupdown")

grid.arrange(p1,p2,p3,p4)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

Modeling

It seems that every city is following a semi-sinusoidal trend (probably it’s following the weather conditions). Let’s find the peaks!

Estimating October Prices (Test)

# Hold-out test: for each city, fit on months 1-9 and predict month 10
# (October); each element of p is a ggplot comparing actuals (black) with the
# model's values (red line).
# NOTE(review): get.predictors() and get.predictions() are not defined in this
# chunk — presumably defined elsewhere in the full document; verify before
# running this section standalone.
p <- lapply(data.summary$city,function(the_city) {
  price.train = get.predictors(1:9)
  price.train$y = data$price[data$city==the_city][1:9]
  price.trend = get.predictors(1:10)
  price.trend$y = data$price[data$city==the_city][1:10]
  
  
  price.prediction <- get.predictions(price.train,10)
  ggplot(price.trend, aes(x=t,y=y)) + 
      geom_point() + 
      geom_point(data=price.prediction, aes(x=t,y=y)) + 
      geom_line(data=price.prediction, aes(x=t,y=y),color="red") + 
      geom_line() +
      ggtitle(the_city)
      
})

Good Predictions

For some of the cities we have good predictions for October

do.call(grid.arrange, c(p[c(3,4,19,21,22,17)], list(ncol=3)))

Bad Predictions

For some other cities, the predictions don’t work that well. This can be because of unknown disturbances that our model doesn’t capture.

do.call(grid.arrange, c(p[c(1,12,20,23)], list(ncol=2)))

## Calculating the Estimation Error [todo][TODO]

Estimating November and December Prices (To be Verified)

# plotting the predictions
# Fit on all ten observed months and extrapolate to months 11-12 (November,
# December); red line = model values over the black actuals.
# NOTE(review): get.predictors()/get.predictions() are not defined in this
# chunk — presumably supplied elsewhere in the document; verify.
p <- lapply(data.summary$city,function(the_city) {
  price.train = get.predictors(1:10)
  price.train$y = data$price[data$city==the_city][1:10]
  price.trend = get.predictors(1:10)
  price.trend$y = data$price[data$city==the_city][1:10]
  price.prediction <- get.predictions(price.train,12)
  
  ggplot(price.trend, aes(x=t,y=y)) + 
      geom_point() + 
      geom_point(data=price.prediction, aes(x=t,y=y)) + 
      geom_line(data=price.prediction, aes(x=t,y=y),color="red") + 
      geom_line() +
      ggtitle(the_city)
      
})
# Review all 25 cities across three pages of panels.
do.call(grid.arrange, c(p[c(1:9)], list(ncol=3)))

do.call(grid.arrange, c(p[c(10:18)], list(ncol=3)))

do.call(grid.arrange, c(p[c(19:25)], list(ncol=3)))

Filling in the prices in the matrix

# Predict November/December per city and write the rounded values into a copy
# of the raw table. Row alignment holds because data.to.be.verified is a copy
# of data.csv and data.tabular was copied from the same object, so indexing by
# data.tabular$city addresses the matching rows.
# NOTE(review): relies on get.predictors()/get.predictions(), not defined in
# this chunk.
data.to.be.verified <- data.csv
for (the_city in data.summary$city) {
  price.train = get.predictors(1:10)
  price.train$y = data$price[data$city==the_city][1:10]
  price.prediction <- get.predictions(price.train,12)
  data.to.be.verified$November[data.tabular$city==the_city] <- round(price.prediction$y[11])
  data.to.be.verified$December[data.tabular$city==the_city] <- round(price.prediction$y[12])
}

# Show June-December (actuals plus the two predicted months).
data.to.be.verified[c(1,7:11,14,15)]
##               city June July August September October November December
## 1         New York  372  317    320       430     430      418      327
## 2      Los Angeles  227  252    245       220     219      201      195
## 3          Chicago  311  299    267       293     307      279      219
## 4           Dallas  148  145    147       157     163      161      153
## 5     Philadelphia  244  200    198       247     241      238      206
## 6          Houston  146  149    143       150     154      156      153
## 7  Washington D.C.  321  282    238       291     346      321      263
## 8            Miami  153  153    148       147     182      193      219
## 9          Atlanta  172  198    169       185     184      179      172
## 10          Boston  410  389    377       413     454      408      314
## 11   San Francisco  311  324    341       348     350      307      260
## 12         Detroit  168  172    166       166     165      169      167
## 13         Orlando  138  147    130       126     139      130      133
## 14       San Diego  193  237    207       180     180      149      144
## 15       Las Vegas  132  133    137       158     172      185      170
## 16         Phoenix  108  101     98       117     136      132      140
## 17    Indianapolis  134  139    144       131     134      127      117
## 18  Salt Lake City  137  134    152       146     145      141      134
## 19          Denver  209  212    204       209     203      197      186
## 20     New Orleans  199  195    172       177     234      219      209
## 21       St. Louis  159  163    154       165     164      153      137
## 22         Seattle  319  321    328       272     228      185      172
## 23     Minneapolis  237  214    215       207     257      232      198
## 24     San Antonio  134  171    138       127     127      107      108
## 25        San Jose  205  206    211       211     209      192      177

```

Future Perspective

Time is tight! If I find more time, I’ll do the following.

  1. Using Similar tHPI Datasets
  1. Collecting new Data
  1. Feedback loop - We should be careful as prices are determined in a negotiating process. By making predictions of future prices, and influencing this negotiation pattern we might witness new patterns.